# Setup: load the packages, set the data path, load project helpers, and
# fix the number of topics of the selected LDA model.
library(tidyverse)
library(data.table)
library(here)
library(kableExtra)
library(tidytext)
library(DT)

# NOTE(review): machine-specific absolute path — consider an environment
# variable or config file so the script runs on other machines.
data_path <- "C:/Users/goutsmedt/Documents/MEGAsync/Research/R/projets/data/green_ecb_responsiveness"

# Project helpers (notably calculate_frex(), used below).
source(here(path.expand("~"), "green_ecb", "function", "functions_for_topic_modelling.R"))

# Number of topics of the LDA model retained after tuning.
K <- 120

# load the topics stats and gamma attributes
# Load the fitted LDA model and its paragraph-level data (gamma loadings
# per topic/document pair), then tag each paragraph with its period.
lda <- readRDS(here(data_path, "topic_modelling", paste0("LDA_", K, ".rds")))
# NOTE(review): between() is inclusive on both ends, so a speech dated
# exactly 2011-11-08 (or 2021-09-01) matches two conditions; case_when()
# keeps the FIRST match, i.e. the earlier period. Dates outside
# 1998-11-20..2023-02-01 get period = NA (filtered out downstream).
# Assumes `date` compares cleanly with these character dates — TODO confirm
# `date` is a Date (or ISO-formatted character) column.
lda_data <- readRDS(here(data_path, 
             "topic_modelling",
             paste0("LDA_", K, "_data.rds"))) %>% 
  as.data.table() %>% 
  .[, period := case_when(between(date, "1998-11-20", "2011-11-08") ~ "Period_1",
                            between(date, "2011-11-08", "2021-09-01") ~ "Period_2",
                            between(date, "2021-09-01", "2023-02-01") ~ "Period_3")] 

# Similarity of every topic to the inflation topics, per period and measure.
lda_proximity <- readRDS(
  here(data_path, "topic_modelling", "similarities_LDA.rds")
)

# Candidate topics to inspect: topics ranked in the top 5 by similarity,
# plus a few manually added ones, minus a manual exclusion list.
manually_added <- c(114, 21, 91, 103)
manually_excluded <- c(78, 76, 115, 43, 74, 109, 56, 61)

topics_to_look <- lda_proximity %>%
  filter((rank <= 5 | topic %in% manually_added) &
           !topic %in% manually_excluded) %>%
  distinct(topic)

# Topics flagged as inflation topics in the paragraph-level data.
inflation_topics <- lda_data %>%
  filter(inflation_topic) %>%
  distinct(topic)

# Inflation topics first, then the similarity-selected topics; `rank`
# records that display order.
topics_to_look <- inflation_topics %>%
  bind_rows(topics_to_look) %>%
  mutate(rank = row_number())

# Per-topic, per-period prevalence: mean gamma and its standard error,
# plus the topic's prevalence rank within each period, reshaped to one
# row per topic (columns mean_Period_*, st_err_Period_*, rank_Period_*).
# NOTE(review): this pipeline mixes dplyr verbs and data.table `[` calls;
# dplyr::filter() returns a copy here, so the `:=` below does not touch
# the global lda_data — TODO confirm if the dplyr version changes.
data_year_subset <- lda_data %>% 
  filter(! is.na(period)) %>% 
  .[,`:=` (mean = mean(gamma),
           st_err = sd(gamma)/sqrt(length(gamma))), by = .(topic, period)] %>%
  .[order(period, desc(mean)),] %>% 
  # one row per topic/period, ranked by decreasing mean gamma
  distinct(topic, topic_name, inflation_topic, period, mean, st_err) %>% 
  .[, rank := 1:.N, by = period] %>% 
  pivot_wider(names_from = "period", values_from = c("mean", "st_err", "rank")) # %>% 
 # mutate(differential = mean_Period_2 - mean_Period_1)

# Speech-level topic loadings: average the paragraph gammas of each topic
# over the paragraphs of one speech (`file`), keep one row per topic/speech.
# NOTE(review): the `:=` runs on lda_data itself (the pipe passes it
# without copying), so this ALSO adds a gamma_speech column to the global
# lda_data by reference — a side effect to keep in mind.
topics_per_speech <- lda_data %>%  
  .[, gamma_speech := mean(gamma), by = .(topic, file)] %>% 
  select(topic, file, title, year, date, speaker_cleaned, gamma_speech, pdf_link, period) %>% 
  unique()
# Calculate the top beta and FREX keywords for each topic.

# Top 15 most probable words (beta) per topic.
beta_lda <- tidy(lda, matrix = "beta") %>% 
  group_by(topic) %>% 
  slice_max(order_by = beta, n = 15, with_ties = FALSE) %>% 
  mutate(rank_beta = row_number()) %>% 
  ungroup() %>% 
  select(topic, term_beta = term, rank_beta, beta)

# Top 15 FREX words per topic (frequency/exclusivity metric; project
# helper calculate_frex() from functions_for_topic_modelling.R).
frex_lda <- calculate_frex(lda, 15, 0.5, topic_method = "LDA") %>% 
  group_by(topic) %>% 
  slice_max(order_by = frex, n = 15, with_ties = FALSE) %>% 
  ungroup() %>% 
  select(term_frex = term, rank_frex = rank, frex)

# Combine beta and FREX keywords side by side.
# NOTE(review): bind_cols() pairs rows purely by position; this is only
# correct because both tables have exactly 15 rows per topic in the same
# topic order. A join on (topic, rank) would be more robust.
lda_words <- beta_lda %>% 
  bind_cols(frex_lda)
 
# Most representative speech
# For each topic, the 10 paragraphs with the highest gamma, formatted for
# display: markdown link to the speech PDF, truncated paragraph text,
# rounded gamma.
top_speech_paragraphs <- lda_data %>%
  select(topic, document_id, title, date, speaker_cleaned, period,
         pdf_link, paragraphs, gamma) %>%
  group_by(topic) %>%
  slice_max(gamma, n = 10, with_ties = FALSE) %>%
  ungroup() %>%
  mutate(
    title_link = paste0("[", title, "](", pdf_link, ")"),
    paragraphs = str_squish(str_trunc(paragraphs, 800, "right")),
    gamma = round(gamma, 3)
  )

# For each topic, the 15 speeches with the highest speech-level gamma,
# with a markdown link to the speech PDF and a rounded gamma value.
top_speech <- topics_per_speech %>%
  select(topic, file, title, date, speaker_cleaned, period,
         pdf_link, gamma_speech) %>%
  group_by(topic) %>%
  slice_max(gamma_speech, n = 15, with_ties = FALSE) %>%
  ungroup() %>%
  mutate(
    title_link = paste0("[", title, "](", pdf_link, ")"),
    gamma_speech = round(gamma_speech, 3)
  )

# Most representative speech per period
# For each topic and period, the 3 paragraphs with the highest gamma,
# formatted for display (markdown PDF link, truncated text, rounded gamma).
# Fix: `period` was selected twice in the original select().
top_speech_paragraphs_period <- lda_data %>% 
  select(topic, document_id, title, date, speaker_cleaned, period,
         pdf_link, paragraphs, gamma) %>% 
  filter(! is.na(period)) %>% 
  group_by(period, topic) %>% 
  slice_max(gamma, n = 3, with_ties = FALSE) %>% 
  mutate(title_link = paste0("[", title, "](", pdf_link, ")"),
         # truncate long paragraphs and collapse repeated whitespace
         paragraphs = str_trunc(paragraphs, 800, "right") %>% str_squish(),
         gamma = round(gamma, 3)) %>% 
  ungroup()

# For each topic and period, the 5 speeches with the highest speech-level
# gamma, with a markdown PDF link and a rounded gamma.
# Fix: `period` was selected twice in the original select().
top_speech_period <- topics_per_speech %>% 
  select(topic, file, title, date, speaker_cleaned, period,
         pdf_link, gamma_speech) %>% 
  filter(! is.na(period)) %>% 
  group_by(period, topic) %>% 
  slice_max(gamma_speech, n = 5, with_ties = FALSE) %>% 
  mutate(title_link = paste0("[", title, "](", pdf_link, ")"),
         gamma_speech = round(gamma_speech, 3)) %>% 
  ungroup()

# Order the selected topics for display: inflation topics first, then by
# total prevalence (sum of the mean gammas over the three periods).
list_topics <- data_year_subset %>%
  filter(topic %in% topics_to_look$topic) %>%
  mutate(prevalence = mean_Period_1 + mean_Period_2 + mean_Period_3) %>%
  arrange(desc(inflation_topic), desc(prevalence)) %>%
  mutate(
    topic_name = paste0("Topic ", topic, ": ", topic_name),
    rank = row_number()
  )

1 Introduction

Our article seeks to understand the transformations in the ECB’s framing of the inflation issue between 1998 and early 2023. To observe these transformations, we use topic modeling on a corpus of ECB policymakers’ speeches.

Topic modeling is a method used to uncover hidden themes (the topics) in a large corpus of text data. It is an “unsupervised” method that automatically identifies structures and categories in an unstructured corpus. This technical appendix provides all the details on our implementation and use of this method.

2 Step 1: Creating the corpus

Our corpus is composed of the speeches of the ECB’s board members between 1998-11-20 and 2023-02-01 listed on the Bank for International Settlements website. We have used tesseract (Ooms 2022) for Optical Character Recognition on some speeches for which recognition was not good. Thanks notably to the R packages tidytext (Silge and Robinson 2016) and tokenizers (Mullen et al. 2018), we divide each speech into a sequence of paragraphs. We remove bibliography paragraphs as well as paragraphs with acknowledgements.

We want to focus on speeches that deal substantially with inflation. We thus decide to keep only the speeches that mention words containing “inflation” (“inflation”, “inflationary”, “disinflation” etc.) a certain number of times. To take into account the length of a speech, we divide the frequency of “inflation” words by the speech’s number of pages. We test three thresholds under which we remove a speech:

  • a small threshold: on average, more than one occurrence of “inflation” per page of the speech;
  • a medium threshold: on average, more than one and a half occurrences per page;
  • a large threshold: on average, more than 2 occurrences per page.

Here is the number of speeches in the corpus depending on the threshold we use:

# Insert the pre-rendered figure: corpus size for each inflation threshold.
knitr::include_graphics(here::here("pictures", glue::glue("threshold_corpus_absolute.png")))

We may also look at the share of speeches in the corpus we are keeping depending on the threshold we use:

# Insert the pre-rendered figure: share of speeches kept per threshold.
knitr::include_graphics(here::here("pictures", glue::glue("threshold_corpus_share.png")))

We see that it leads us to remove a lot of speeches between 2009 and 2013, a sign that inflation was a less important issue at this time. But the differences between the different thresholds are not very large on average. We decide to take the most restrictive threshold (2 * number of pages), which gives us 817 speeches. This choice gives us a sufficiently large corpus to have a representative sample of speeches on inflation, while avoiding the inclusion of speeches in which inflation is not so central.

3 Step 2: Choosing the appropriate topic model

3.1 Step 2.a: Pre-processing of the corpus

Texts are tokenized with Silge and Robinson (2016) and Mullen et al. (2018). We keep unigrams (like “price”), bigrams (“price stability”) and trigrams (“maintain price stability”). The corpus is organised in paragraphs: the documents in the topic modelling are the 6818 paragraphs of the 817 speeches. This allows (i) for a more fine-grained understanding of what the whole speech is about as well as (ii) to measure more accurately correlation between topics, at the paragraph level.

We remove the words in the stopwords lists “nltk” and “iso” implemented in the R package stopwords (Benoit, Muhr, and Watanabe 2021). These are large lists of stopwords, allowing us to remove unnecessary words in our analysis.1. We lemmatize each word using the dictionary incorporated in textstem (Rinker 2018).

3.2 Step 2.b: Evaluating different models

3.2.1 Latent Dirichlet Allocation

To run our topic model, we use the usual Latent Dirichlet Allocation (LDA), which is a probabilistic generative model employed in machine learning to detect topics present in a collection of documents. It presupposes that each document comprises a blend of a limited number of concealed topics, with each word in the document generated by one of those topics.

The LDA model characterizes each document as a distribution of topics, wherein each topic is a distribution over words. It uses Bayesian inference to estimate the probability distribution of topics and words within each document, as well as the overall distribution of topics in the entire collection.

The algorithm encompasses three steps:

  1. initializing the topic and word distributions
  2. iteratively allocating words in each document to topics based on their likelihood of belonging to each topic
  3. updating the topic and word distributions according to the assignments. The process persists until convergence is achieved, at which point the topic and word distributions are utilized to determine the most probable topics within each document

3.2.2 Testing Different Models

Denny and Spirling (2018) have shown that pre-processing steps, and notably the filtering of rare words, may have a large impact on the results of topic modelling. For this reason, we decide to test three different thresholds, by removing all the ngrams which appear fewer than 5, 10 or 20 times. This results in a different vocabulary. We run our LDA model for each of these vocabulary lists.

We also test different models for different numbers of topics, from 30 topics to 160, going 10 by 10. For each of our three vocabulary filters, we thus run 14 models with a different number of topics, for a total of 42 models. We use both quantitative and qualitative approaches to choose our filtering threshold and the number of topics.

First, we compute 4 quantitative metrics implemented in Nikita (2020) for our different models. Two metrics, inspired by Arun et al. (2010) and Cao et al. (2009), have to be minimized; the two others, inspired by Griffiths and Steyvers (2004) and Deveaud, SanJuan, and Bellot (2014), have to be maximized. Here is an interactive figure to observe the results for the different pre-processing methods (method 1 for more than 5 occurrences; method 2 for more than 10; and method 3 for more than 20). The crosses indicate the maximized and minimized values.

# Embed the interactive tuning-metrics figure (pre-rendered HTML widget).
htmltools::includeHTML(here::here("writing", "tuning_topicmodels.html"))

We select different numbers of topics for each method, for which the average metrics appear good. The numbers of topics chosen are indicated by vertical lines. We have selected 6 models, with different combinations of pre-processing method and number of topics, to be evaluated qualitatively. The qualitative assessment allows us to evaluate the “interpretability” of our different models. Interpretability in topic modeling refers to the ability for humans to understand and make sense of the topics that are generated by the model. In other words, an interpretable topic model is one that produces topics that are meaningful, coherent, and useful for understanding the underlying structure of the data.

One method for evaluating the interpretability of topic models is called the “intruder” method (Chang et al. 2009). The intruder method involves adding a word that does not belong to a given topic into the list of top words for that topic and evaluating whether the human evaluator can identify the “intruder” word. If the human evaluator can easily identify the intruder word, then the topic is considered to be more interpretable. This method provides a useful way to compare the interpretability of different topic models.

For each topic model, we randomly select 30 topics in which an intruder has to be found. The whole list of topics (180 topics in total) is shuffled: there is no way to connect a topic under evaluation with the model it comes from. We reproduce this three times and give the lists to three different human coders to evaluate. The following figure displays the result:

# Insert the pre-rendered figure: intruder-test results per model and coder.
knitr::include_graphics(here::here("pictures", glue::glue("plot_intruder.png")))

After looking at the details of the two best topic models (55 topics with pre-processing method 2; 120 topics with method 3), it appears that choosing 120 topics allowed for more granularity in the evolution of ECB communication. We thus choose the model with 120 topics and a vocabulary of 5707 words and expressions.

4 Step 3: Analysing the results

4.1 General presentation of the topic model

To understand our 120 topics, we look at two types of statistics:

  • the top “beta” words, i.e. the list of the top words for each topic. These are the words that have the highest probability of appearing in the documents that are assigned to that topic. Examining the top words can give a sense of the general theme or subject matter of the topic.
  • the FREX value (Bischof and Airoldi 2012) is a metric for evaluating the quality of keywords within a topic. It takes into account both the frequency of a given word within a topic and the exclusivity of that word to that topic (i.e. the degree to which the word is used almost exclusively within that topic).

These two sets of keywords allow us to distinguish between 14 topics on inflation and price stability and the other topics. The 14 are the topics which have “inflation” or “price” among their 5 top beta words.

Here is the list of the 120 topics and of their prevalence across the three periods.

# Table of the topics: top-10 beta and FREX keywords collapsed into one
# string per topic, joined with the mean prevalence in each period.
# Fix: make the join key explicit (`by = "topic"`) instead of relying on
# left_join()'s natural-join guess.
lda_words_collapsed <- lda_words %>% 
  filter(rank_beta <= 10) %>% 
  select(-starts_with("rank")) %>% 
  group_by(topic) %>% 
  summarise(across(starts_with("term"), ~str_flatten(., collapse = "; "))) %>%
  left_join(select(data_year_subset, topic, starts_with("mean")),
            by = "topic")

# Quartile breaks of Period-1 prevalence, used to colour the three
# mean_Period_* columns (positions 4:6) of the interactive table.
quants <- quantile(lda_words_collapsed$mean_Period_1)[2:4]
datatable(lda_words_collapsed,
          class = 'cell-border stripe',
          rownames = FALSE) %>% 
  formatStyle(names(lda_words_collapsed)[4:6],
              backgroundColor = styleInterval(quants, c('lightgray', "lightblue", "pink", "red")))

We assess the prevalence per period by averaging the “gamma” values of all paragraphs published in a period with the corresponding topic.2

To obtain a more fine-grained overview of the evolution of each topic, we can regress the gamma values of each topic on dates. In other words, we average the gamma values of each topic at the different dates of speech publication, and we want to create a smooth curve that represents the overall trend of topics over time. We are using an estimation method called “loess” to create a smooth curve that connects those average values. The loess method works by fitting a series of local polynomial regression models to the data, with each model centered on a particular point along the x-axis (in this case, the dates). The resulting curve is a smoothed representation of the data that helps to highlight the overall trend while minimizing the impact of random fluctuations or outliers.3 Here is the evolution of the topics over time:

knitr::include_graphics(here::here("pictures", glue::glue("TM_LDA_topic_per_date.png")))

We use the same method to produce the figures on topics evolution in the article.

We can also zoom on the 14 topics on inflation and price stability.

knitr::include_graphics(here::here("pictures", glue::glue("TM_LDA_main_topic_per_date.png")))

4.2 Calculating similarities

Beyond observing the variation of topics prevalence over time, we also rank the non-inflation topics according to their similarity to the inflation and price stability topics. For each paragraph, we average the gamma values associated with the 14 topics on inflation. We then compare this vector of average gamma values with the vector of gamma values for each other topic. For each period, we use two similarity measures:

  • Cosine similarity measures the cosine of the angle between two vectors in a high-dimensional space. In the context of topic modeling, each topic distribution can be seen as a vector in a high-dimensional space, where the dimensions correspond to the different documents of the corpus.
  • Jensen-Shannon divergence, on the other hand, measures the difference between two probability distributions based on their relative entropy or Kullback-Leibler divergence. Unlike cosine similarity, the Jensen-Shannon divergence takes into account the magnitude of the probability distributions (ie the total probability mass or density of the distribution), not just their direction.

We then rank the different topics according to their similarity measures, “1” representing the topic most correlated with the topics on inflation and price stability. The following table displays the correlation ranking of each topic for each period and measure.

# Rank of each topic on the two similarity measures, spread into one
# column per measure ("cosine" and "jsd").
proximities <- lda_proximity %>%
  filter(similarity_measure %in% c("cosine", "jsd")) %>%
  select(-similarity) %>%
  pivot_wider(names_from = "similarity_measure",
              values_from = all_of("rank"))

# Display the ranking table, recoding the period codes into year ranges;
# highlight the 10 most similar topics (blue) and ranks 40-120 (gray) in
# the two rank columns (positions 4:5).
proximities %>%
  mutate(period = case_match(period,
                             "Period_1" ~ "1998-2011",
                             "Period_2" ~ "2011-2021",
                             "Period_3" ~ "2021-2023")) %>%
  datatable(class = "cell-border stripe",
            rownames = FALSE) %>%
  formatStyle(names(proximities)[4:5],
              backgroundColor = styleEqual(
                c(1:10, 40:120),
                c(rep("lightblue", 10), rep("lightgray", 81))
              ))

In the article, we use the Jensen-Shannon divergence as we want to take into account the fact that some topics are more prevalent than others (and so have a higher magnitude). Indeed, the cosine measure is based solely on the cosine of the angle between the two vectors representing the topic distributions, and thus does not take into account the magnitudes of the vectors.

Beyond this technical appendix, another document is provided with information on the content of each topic (list of keywords, and most representative paragraphs and speeches).

References

Arun, R., V. Suresh, C. E. Veni Madhavan, and M. N. Narasimha Murthy. 2010. “On Finding the Natural Number of Topics with Latent Dirichlet Allocation: Some Observations.” In Advances in Knowledge Discovery and Data Mining, edited by Mohammed J. Zaki, Jeffrey Xu Yu, B. Ravindran, and Vikram Pudi, 391–402. Lecture Notes in Computer Science. Berlin, Heidelberg: Springer. https://doi.org/10.1007/978-3-642-13657-3_43.
Benoit, Kenneth, David Muhr, and Kohei Watanabe. 2021. Stopwords: Multilingual Stopword Lists. https://CRAN.R-project.org/package=stopwords.
Bischof, Jonathan, and Edoardo M. Airoldi. 2012. “Summarizing Topical Content with Word Frequency and Exclusivity.” In Proceedings of the 29th International Conference on Machine Learning (ICML-12), 201–8.
Cao, Juan, Tian Xia, Jintao Li, Yongdong Zhang, and Sheng Tang. 2009. “A Density-Based Method for Adaptive LDA Model Selection.” Neurocomputing, Advances in Machine Learning and Computational Intelligence, 72 (7): 1775–81. https://doi.org/10.1016/j.neucom.2008.06.011.
Chang, Jonathan, Sean Gerrish, Chong Wang, Jordan L. Boyd-Graber, and David M. Blei. 2009. “Reading Tea Leaves: How Humans Interpret Topic Models.” In Advances in Neural Information Processing Systems, 288–96.
Denny, Matthew J., and Arthur Spirling. 2018. “Text Preprocessing For Unsupervised Learning: Why It Matters, When It Misleads, And What To Do About It.” Political Analysis 26 (2): 168–89. https://doi.org/10.1017/pan.2017.44.
Deveaud, Romain, Eric SanJuan, and Patrice Bellot. 2014. “Accurate and Effective Latent Concept Modeling for Ad Hoc Information Retrieval.” Document Numérique 17 (1): 61–84. https://doi.org/10.3166/dn.17.1.61-84.
Griffiths, Thomas L., and Mark Steyvers. 2004. “Finding Scientific Topics.” Proceedings of the National Academy of Sciences 101 (suppl 1): 5228–35. https://doi.org/10.1073/pnas.0307752101.
Mullen, Lincoln A., Kenneth Benoit, Os Keyes, Dmitry Selivanov, and Jeffrey Arnold. 2018. “Fast, Consistent Tokenization of Natural Language Text.” Journal of Open Source Software 3: 655. https://doi.org/10.21105/joss.00655.
Nikita, Murzintcev. 2020. Ldatuning: Tuning of the Latent Dirichlet Allocation Models Parameters. https://CRAN.R-project.org/package=ldatuning.
Ooms, Jeroen. 2022. Tesseract: Open Source OCR Engine. https://CRAN.R-project.org/package=tesseract.
Rinker, Tyler W. 2018. textstem: Tools for Stemming and Lemmatizing Text. Buffalo, New York. http://github.com/trinker/textstem.
Silge, Julia, and David Robinson. 2016. “Tidytext: Text Mining and Analysis Using Tidy Data Principles in r.” JOSS 1 (3). https://doi.org/10.21105/joss.00037.

  1. Nonetheless, we remove from these stopword lists some relevant words like “research”, “member”, etc.↩︎

  2. The “gamma” value measures the strength of the association between a document (here a speech paragraph) and a topic.↩︎

  3. For this local polynomial regression, we use a “span” parameter of 0.2. The “span” parameter is a value used in the loess method to control the amount of smoothing applied to the curve. Specifically, it determines the width of the window of data points used to fit each local polynomial regression model. A larger span value will result in a smoother curve that better captures the overall trend in the data but may miss some of the smaller fluctuations or details. A smaller span value will result in a more jagged curve that captures more of the individual data points.↩︎